{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "TnP0KMKJuTkP"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8aJYBZFNMyXc",
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "# Install evidently from source only when it cannot be imported.\n",
        "# Catch ImportError specifically: a bare except would also hide unrelated failures\n",
        "# (for example a KeyboardInterrupt) behind a reinstall.\n",
        "try:\n",
        "    import evidently\n",
        "except ImportError:\n",
        "    !pip install git+https://github.com/evidentlyai/evidently.git"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "import io\n",
        "import os\n",
        "import zipfile\n",
        "\n",
        "import requests"
      ],
      "metadata": {
        "id": "UfuNPLwjO99K"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Install implicit only when it is missing, in the same way as the evidently install cell,\n",
        "# so that re-running the notebook does not reinstall the package every time.\n",
        "try:\n",
        "    import implicit\n",
        "except ImportError:\n",
        "    !pip install implicit"
      ],
      "metadata": {
        "id": "8A_dH0K0082d"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Load data"
      ],
      "metadata": {
        "id": "KjF_x-wfcZOI"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Download the MovieLens 100k archive. The timeout keeps a stalled connection from\n",
        "# blocking the kernel indefinitely.\n",
        "response = requests.get(\"http://files.grouplens.org/datasets/movielens/ml-100k.zip\", timeout=60)\n",
        "# Stop on an HTTP error instead of handing an error page to ZipFile below.\n",
        "response.raise_for_status()\n",
        "content = response.content\n",
        "\n",
        "# Read the needed members from memory and split each one into raw text lines.\n",
        "# The item and user files are decoded as latin-1; the rating files use the default codec.\n",
        "with zipfile.ZipFile(io.BytesIO(content)) as arc:\n",
        "  train = arc.read(\"ml-100k/ua.base\").decode().split(\"\\n\")\n",
        "  test = arc.read(\"ml-100k/ua.test\").decode().split(\"\\n\")\n",
        "  movies = arc.read(\"ml-100k/u.item\").decode(encoding='latin-1').split(\"\\n\")\n",
        "  users = arc.read(\"ml-100k/u.user\").decode(encoding='latin-1').split(\"\\n\")"
      ],
      "metadata": {
        "id": "f1wLolXpM02U"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Turn the raw text lines read from the archive into typed DataFrames.\n",
        "# Lines that do not fill every column (such as the empty string left after the final\n",
        "# newline) are padded with None and removed by dropna() before the type casts.\n",
        "\n",
        "# Rating files are tab-separated.\n",
        "rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
        "train = pd.DataFrame([line.split('\\t') for line in train], columns=rating_columns).dropna().astype(int)\n",
        "test = pd.DataFrame([line.split('\\t') for line in test], columns=rating_columns).dropna().astype(int)\n",
        "\n",
        "# User and movie files are pipe-separated.\n",
        "user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n",
        "users = (\n",
        "    pd.DataFrame([line.split('|') for line in users], columns=user_columns)\n",
        "    .dropna()\n",
        "    .astype({'user_id': int, 'age': int})\n",
        ")\n",
        "\n",
        "genres = ['unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'noir',\n",
        "          'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']\n",
        "movie_columns = ['movie_id', 'title', 'year', '-', 'url'] + genres\n",
        "movies = (\n",
        "    pd.DataFrame([line.split('|') for line in movies], columns=movie_columns)\n",
        "    .dropna()\n",
        "    .astype({'movie_id': int})\n",
        "    .drop(columns=['-', 'url'])\n",
        ")\n",
        "movies[genres] = movies[genres].astype(int)\n",
        "\n",
        "# Age of each movie in years, measured back from the latest date in the 'year' column.\n",
        "# The column name 'moive_age' is referenced by later cells, so the spelling is kept.\n",
        "release_dates = pd.to_datetime(movies.year)\n",
        "movies['moive_age'] = (release_dates.max() - release_dates).dt.days / 365"
      ],
      "metadata": {
        "id": "-V1w4P5LeV4X"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Utils"
      ],
      "metadata": {
        "id": "luiAKltK9ze3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def transform_predictions(k, user_ids, item_ids):\n",
        "  \"\"\"Flatten top-k recommendations into a long (user_id, movie_id, rank) frame.\n",
        "\n",
        "  Each user id is repeated k times, `item_ids` is flattened in row order and\n",
        "  ranks run from 1 to k for every user.\n",
        "  \"\"\"\n",
        "  users_column = np.repeat(user_ids, k)\n",
        "  items_column = item_ids.flatten()\n",
        "  ranks_column = list(range(1, k + 1)) * len(user_ids)\n",
        "  return pd.DataFrame(\n",
        "      data=np.column_stack((users_column, items_column, ranks_column)),\n",
        "      columns=['user_id', 'movie_id', 'rank']\n",
        "  )\n",
        "\n",
        "\n",
        "def prepare_prediction_df(k, user_ids, item_ids, true):\n",
        "  \"\"\"Outer-join the ranked recommendations with the `true` interactions.\n",
        "\n",
        "  Rows that exist only in `true` have no rank after the join; they receive the\n",
        "  user's largest rank plus one.\n",
        "  \"\"\"\n",
        "  def fill_unranked(ranks):\n",
        "    return ranks.fillna(ranks.max() + 1)\n",
        "\n",
        "  merged = transform_predictions(k, user_ids, item_ids).merge(\n",
        "      true, how='outer', on=['user_id', 'movie_id']\n",
        "  )\n",
        "  merged['rank'] = merged.groupby('user_id')['rank'].transform(fill_unranked)\n",
        "  return merged\n",
        "\n",
        "\n",
        "def get_embeddings(model, movies_list, users_list, factors):\n",
        "  \"\"\"Return (item_factors, user_factors) frames built from the model's factor matrices.\n",
        "\n",
        "  The first column of each frame holds the ids from `movies_list` / `users_list`;\n",
        "  the remaining `factors` columns are named item_factor_1.. and user_factor_1..\n",
        "  \"\"\"\n",
        "  factor_numbers = range(1, factors + 1)\n",
        "  item_columns = ['movie_id'] + [f'item_factor_{n}' for n in factor_numbers]\n",
        "  user_columns = ['user_id'] + [f'user_factor_{n}' for n in factor_numbers]\n",
        "  item_factors = pd.DataFrame(np.column_stack((movies_list, model.item_factors)), columns=item_columns)\n",
        "  user_factors = pd.DataFrame(np.column_stack((users_list, model.user_factors)), columns=user_columns)\n",
        "  return item_factors, user_factors\n",
        "\n",
        "\n",
        "def get_full_df(df, item_factors, user_factors):\n",
        "  \"\"\"Left-join movie attributes, user attributes and both factor frames onto `df`.\n",
        "\n",
        "  Reads the notebook-level `movies` and `users` frames.\n",
        "  \"\"\"\n",
        "  lookups = (\n",
        "      (movies, 'movie_id'),\n",
        "      (users, 'user_id'),\n",
        "      (item_factors, 'movie_id'),\n",
        "      (user_factors, 'user_id'),\n",
        "  )\n",
        "  for frame, key in lookups:\n",
        "    df = df.merge(frame, on=[key], how='left')\n",
        "  return df"
      ],
      "metadata": {
        "id": "MqP6bLDv92hY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Train model"
      ],
      "metadata": {
        "id": "J4Z6jBMZcwnJ"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Let's get predictions from two models: an ALS model and a baseline that recommends the most popular movies to every user."
      ],
      "metadata": {
        "id": "XXba3z0w_y7p"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from implicit.cpu.als import AlternatingLeastSquares\n",
        "from scipy.sparse import csr_matrix\n",
        "\n",
        "# One row per user_id and one column per movie_id; pairs without a rating become 0.\n",
        "ratings_wide = train.pivot_table(values=\"rating\", index=['user_id'], columns=['movie_id'])\n",
        "pivot_table = ratings_wide.fillna(0)\n",
        "\n",
        "# Fit ALS on the sparse form of the matrix; random_state fixes the initialisation.\n",
        "user_item_matrix = csr_matrix(pivot_table)\n",
        "als_model = AlternatingLeastSquares(factors=20, iterations=5, random_state=0)\n",
        "als_model.fit(user_item_matrix)"
      ],
      "metadata": {
        "id": "-FQDsHEA3OKw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Users to score, in order of first appearance in the test split.\n",
        "test_user_ids = test.user_id.unique()\n",
        "\n",
        "# Row positions of those users in the matrix the model was fitted on. Looking them up in\n",
        "# pivot_table.index avoids assuming that user ids are exactly 1..n.\n",
        "user_rows = pivot_table.index.get_indexer(test_user_ids)\n",
        "\n",
        "item_positions, scores = als_model.recommend(\n",
        "    user_rows,\n",
        "    csr_matrix(pivot_table.loc[test_user_ids]),\n",
        "    N=30,\n",
        "    filter_already_liked_items=True,\n",
        ")\n",
        "\n",
        "# recommend() returns column positions of the fitted matrix, not movie ids. Translate them\n",
        "# through pivot_table.columns so that they can be joined with `test` and `movies` on movie_id.\n",
        "ids = pivot_table.columns.to_numpy()[item_positions]\n",
        "\n",
        "als_df = prepare_prediction_df(30, test_user_ids, ids, test)"
      ],
      "metadata": {
        "id": "1kDXlE-FAEij"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Baseline: every test user receives the same 30 movies, the ones with the most\n",
        "# ratings in the training split, most-rated first.\n",
        "most_popular_top = list(train.movie_id.value_counts().head(30).index)\n",
        "\n",
        "test_user_ids = test.user_id.unique()\n",
        "rec_array = np.array([most_popular_top for _ in test_user_ids])\n",
        "most_popular_df = prepare_prediction_df(30, test_user_ids, rec_array, test)"
      ],
      "metadata": {
        "id": "j6z-vy8jArCq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Label the learned factor matrices with the ids used when the model was fitted.\n",
        "item_factors, user_factors = get_embeddings(\n",
        "    model=als_model,\n",
        "    movies_list=pivot_table.columns,\n",
        "    users_list=pivot_table.index,\n",
        "    factors=20,\n",
        ")\n",
        "\n",
        "# Add movie attributes, user attributes and both sets of factors to every frame.\n",
        "# NOTE: the three names are rebound to the enriched frames, so run this cell once per session.\n",
        "als_df, most_popular_df, train = [\n",
        "    get_full_df(frame, item_factors, user_factors)\n",
        "    for frame in (als_df, most_popular_df, train)\n",
        "]"
      ],
      "metadata": {
        "id": "3vuUgdDmAW5o"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Names of the 20 item factor columns (item_factor_1 .. item_factor_20).\n",
        "item_features = [f'item_factor_{n}' for n in range(1, 21)]"
      ],
      "metadata": {
        "id": "VlKggz4OCysy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Metrics"
      ],
      "metadata": {
        "id": "b-L1Odk0A2As"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from evidently.metrics import PrecisionTopKMetric\n",
        "from evidently.metrics import RecallTopKMetric\n",
        "from evidently.metrics import FBetaTopKMetric\n",
        "from evidently.metrics import MAPKMetric\n",
        "from evidently.metrics import NDCGKMetric\n",
        "from evidently.metrics import DiversityMetric\n",
        "from evidently.metrics import ItemBiasMetric\n",
        "from evidently.metrics import NoveltyMetric\n",
        "from evidently.metrics import PersonalizationMetric\n",
        "from evidently.metrics import PopularityBias\n",
        "from evidently.metrics import SerendipityMetric\n",
        "from evidently.metrics import UserBiasMetric\n",
        "from evidently.metrics import HitRateKMetric\n",
        "from evidently.metrics import MRRKMetric\n",
        "from evidently.metrics import RecCasesTable\n",
        "\n",
        "from evidently.pipeline.column_mapping import ColumnMapping\n",
        "from evidently.report import Report"
      ],
      "metadata": {
        "id": "vxU6s88ism0_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Recommendations are given as ranks; items are identified by title, users by user_id.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='rank',\n",
        "    target='rating',\n",
        "    prediction='rank',\n",
        "    item_id='title',\n",
        "    user_id='user_id',\n",
        ")\n",
        "\n",
        "report = Report(\n",
        "    metrics=[\n",
        "        PrecisionTopKMetric(k=5),\n",
        "        RecallTopKMetric(k=5),\n",
        "        FBetaTopKMetric(k=5),\n",
        "        MAPKMetric(k=5),\n",
        "        NDCGKMetric(k=5),\n",
        "        MRRKMetric(k=5),\n",
        "        HitRateKMetric(k=5),\n",
        "        DiversityMetric(k=5, item_features=item_features),\n",
        "        NoveltyMetric(k=5),\n",
        "        PersonalizationMetric(k=5),\n",
        "        SerendipityMetric(k=5, item_features=item_features),\n",
        "        PopularityBias(k=5),\n",
        "        ItemBiasMetric(k=5, column_name='moive_age'),\n",
        "        ItemBiasMetric(k=5, column_name='crime'),\n",
        "        UserBiasMetric(column_name='age'),\n",
        "        UserBiasMetric(column_name='gender'),\n",
        "        RecCasesTable(display_features=['action', 'adventure', 'animation'], item_num=10),\n",
        "    ]\n",
        ")\n",
        "\n",
        "# Rows without a title or user_id are dropped; remaining gaps in the two\n",
        "# prediction frames are filled with 0. The most-popular baseline is the reference.\n",
        "id_columns = ['title', 'user_id']\n",
        "report.run(\n",
        "    reference_data=most_popular_df.dropna(subset=id_columns).fillna(0),\n",
        "    current_data=als_df.dropna(subset=id_columns).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        "    additional_data={'current_train_data': train.dropna(subset=id_columns)},\n",
        ")\n",
        "report"
      ],
      "metadata": {
        "id": "7KIQreI6tKEA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Metric Preset"
      ],
      "metadata": {
        "id": "EuyiF9Jfp_rz"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from evidently.metric_preset import RecsysPreset"
      ],
      "metadata": {
        "id": "RM9W0WTwOJXc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Same data and column mapping as the explicit metric list, configured through one preset.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='rank',\n",
        "    target='rating',\n",
        "    prediction='rank',\n",
        "    item_id='title',\n",
        "    user_id='user_id',\n",
        ")\n",
        "\n",
        "preset = RecsysPreset(\n",
        "    k=5,\n",
        "    display_features=['action', 'adventure', 'animation'],\n",
        "    item_features=item_features,\n",
        "    user_bias_columns=['age', 'gender'],\n",
        "    item_bias_columns=['moive_age', 'crime'],\n",
        ")\n",
        "report = Report(metrics=[preset])\n",
        "\n",
        "# Rows without a title or user_id are dropped; remaining gaps in the two\n",
        "# prediction frames are filled with 0.\n",
        "id_columns = ['title', 'user_id']\n",
        "report.run(\n",
        "    reference_data=most_popular_df.dropna(subset=id_columns).fillna(0),\n",
        "    current_data=als_df.dropna(subset=id_columns).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        "    additional_data={'current_train_data': train.dropna(subset=id_columns)},\n",
        ")\n",
        "report"
      ],
      "metadata": {
        "id": "j04C9ICKq-30"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Tests"
      ],
      "metadata": {
        "id": "ViTG7pNzssDH"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from evidently.test_suite import TestSuite\n",
        "\n",
        "# Import the tests by name instead of with `*`, so that it is clear where each\n",
        "# name used in the cells below comes from.\n",
        "from evidently.tests import (\n",
        "    TestARP,\n",
        "    TestCoverage,\n",
        "    TestDiversity,\n",
        "    TestFBetaTopK,\n",
        "    TestGiniIndex,\n",
        "    TestHitRateK,\n",
        "    TestMAPK,\n",
        "    TestMRRK,\n",
        "    TestNDCGK,\n",
        "    TestNovelty,\n",
        "    TestPersonalization,\n",
        "    TestPrecisionTopK,\n",
        "    TestRecallTopK,\n",
        "    TestScoreEntropy,\n",
        "    TestSerendipity,\n",
        ")"
      ],
      "metadata": {
        "id": "V6PJCvCVs3nZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Recommendations are given as ranks; items are identified by title, users by user_id.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='rank',\n",
        "    target='rating',\n",
        "    prediction='rank',\n",
        "    item_id='title',\n",
        "    user_id='user_id',\n",
        ")\n",
        "\n",
        "tests = TestSuite(\n",
        "    tests=[\n",
        "        TestPrecisionTopK(k=5),\n",
        "        TestRecallTopK(k=5),\n",
        "        TestFBetaTopK(k=5),\n",
        "        TestHitRateK(k=5),\n",
        "        TestMAPK(k=5),\n",
        "        TestMRRK(k=5),\n",
        "        TestNDCGK(k=5),\n",
        "        TestNovelty(k=5),\n",
        "        TestPersonalization(k=5),\n",
        "        TestSerendipity(k=5, item_features=item_features),\n",
        "        TestDiversity(k=5, item_features=item_features),\n",
        "        TestARP(k=5),\n",
        "        TestGiniIndex(k=5),\n",
        "        TestCoverage(k=5),\n",
        "    ]\n",
        ")\n",
        "\n",
        "# Rows without a title or user_id are dropped; remaining gaps in the two\n",
        "# prediction frames are filled with 0.\n",
        "id_columns = ['title', 'user_id']\n",
        "tests.run(\n",
        "    reference_data=most_popular_df.dropna(subset=id_columns).fillna(0),\n",
        "    current_data=als_df.dropna(subset=id_columns).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        "    additional_data={'current_train_data': train.dropna(subset=id_columns)},\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "kfD5DuEzsFq0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Test Preset"
      ],
      "metadata": {
        "id": "TnP0KMKJuTkP"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from evidently.test_preset import RecsysTestPreset"
      ],
      "metadata": {
        "id": "3kSGozrXudgQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Recommendations are given as ranks; items are identified by title, users by user_id.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='rank',\n",
        "    target='rating',\n",
        "    prediction='rank',\n",
        "    item_id='title',\n",
        "    user_id='user_id',\n",
        ")\n",
        "\n",
        "tests = TestSuite(tests=[RecsysTestPreset(k=5)])\n",
        "\n",
        "# Rows without a title or user_id are dropped; remaining gaps in the two\n",
        "# prediction frames are filled with 0.\n",
        "id_columns = ['title', 'user_id']\n",
        "tests.run(\n",
        "    reference_data=most_popular_df.dropna(subset=id_columns).fillna(0),\n",
        "    current_data=als_df.dropna(subset=id_columns).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        "    additional_data={'current_train_data': train.dropna(subset=id_columns)},\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "Ebz3Zl1UuEDt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Metric and test defined for recommendations_type = 'score' only"
      ],
      "metadata": {
        "id": "bI2q6Xfzv3u9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# collect data with scores\n",
        "# One row per (user, recommended item): `ids` and `scores` come from the ALS\n",
        "# recommendation cell and hold 30 entries per test user.\n",
        "recommended_scores = pd.DataFrame(\n",
        "    {\n",
        "        'user_id': np.repeat(test.user_id.unique(), 30),\n",
        "        'movie_id': ids.flatten(),\n",
        "        'scores': scores.flatten(),\n",
        "    }\n",
        ")\n",
        "\n",
        "# Outer-join with the test interactions; test rows that were not recommended have\n",
        "# no score after the join and get a score of 0.\n",
        "als_df_scores = recommended_scores.merge(test, how='outer', on=['user_id', 'movie_id'])\n",
        "als_df_scores['scores'] = als_df_scores['scores'].fillna(0)"
      ],
      "metadata": {
        "id": "pPveHdLLuvHB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from evidently.metrics import ScoreDistribution"
      ],
      "metadata": {
        "id": "pYKO5JNGxQhM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "report = Report(metrics=[\n",
        "  ScoreDistribution(k=5)\n",
        "])\n",
        "# als_df_scores is never joined with the movie table, so it has no 'title' column;\n",
        "# the item identifier available in this frame is 'movie_id'.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='score',\n",
        "    target='rating',\n",
        "    prediction='scores',\n",
        "    item_id='movie_id',\n",
        "    user_id='user_id',\n",
        ")\n",
        "report.run(\n",
        "    reference_data=None,\n",
        "    current_data=als_df_scores.dropna(subset=['movie_id', 'user_id']).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ],
      "metadata": {
        "id": "wNedeDcCwOUv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tests = TestSuite(tests=[TestScoreEntropy(k=5, gt=3)])\n",
        "# als_df_scores is never joined with the movie table, so it has no 'title' column;\n",
        "# the item identifier available in this frame is 'movie_id'.\n",
        "column_mapping = ColumnMapping(\n",
        "    recommendations_type='score',\n",
        "    target='rating',\n",
        "    prediction='scores',\n",
        "    item_id='movie_id',\n",
        "    user_id='user_id',\n",
        ")\n",
        "tests.run(\n",
        "    reference_data=None,\n",
        "    current_data=als_df_scores.dropna(subset=['movie_id', 'user_id']).fillna(0),\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "Zl-5kv5HwQ-j"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "E0CHmK7Uq5mC"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}